In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from pathlib import Path
import re

try:
    plt.style.use('seaborn-whitegrid')
except OSError:
    print("Style 'seaborn-whitegrid' not found, trying 'ggplot'.")
    plt.style.use('ggplot')

sns.set_context("talk")

SEASONS_STR = "2022_2024"
BASE_PROCESSED_DATA_DIR = Path("../data/new/processed_data/")

processed_files_pattern = f"*_stats_sentiment_{SEASONS_STR}.csv"
player_files_info = []

for f_path in BASE_PROCESSED_DATA_DIR.glob(processed_files_pattern):
    filename = f_path.name
    match = re.match(rf"(.+)_stats_sentiment_{SEASONS_STR}\.csv", filename)
    if match:
        player_slug = match.group(1)
        player_name_display = player_slug.replace('_', ' ').title()
        player_files_info.append({
            "slug": player_slug,
            "display_name": player_name_display,
            "file_path": f_path
        })
    else:
        match_plus = re.match(rf"(.+)_stats_plus_sentiment_{SEASONS_STR}\.csv", filename)
        if match_plus:
            player_slug = match_plus.group(1)
            player_name_display = player_slug.replace('_', ' ').title()
            player_files_info.append({
                "slug": player_slug,
                "display_name": player_name_display,
                "file_path": f_path
            })
        match_fullsnap = re.match(rf"(.+)_stats_plus_sentiment_fullsnap_{SEASONS_STR}\.csv", filename)
        if match_fullsnap:
            player_slug = match_fullsnap.group(1)
            player_name_display = player_slug.replace('_', ' ').title()
            player_files_info.append({
                "slug": player_slug,
                "display_name": player_name_display,
                "file_path": f_path
            })


if not player_files_info:
    print(f" No processed player files found in {BASE_PROCESSED_DATA_DIR} matching the pattern '{processed_files_pattern}' or similar variations.")
    print("Please ensure 'process_all_players.py' has run and generated files like 'playername_stats_sentiment_2022_2024.csv'")
else:
    print(f"Found {len(player_files_info)} players to process:")
    for p_info in player_files_info:
        print(f"  - {p_info['display_name']} (File: {p_info['file_path'].name})")
Found 8 players to process:
  - Anthony Edwards (File: anthony_edwards_stats_sentiment_2022_2024.csv)
  - Donovan Mitchell (File: donovan_mitchell_stats_sentiment_2022_2024.csv)
  - Giannis Antetokounmpo (File: giannis_antetokounmpo_stats_sentiment_2022_2024.csv)
  - Jalen Brunson (File: jalen_brunson_stats_sentiment_2022_2024.csv)
  - Lebron James (File: lebron_james_stats_sentiment_2022_2024.csv)
  - Luka Doncic (File: luka_doncic_stats_sentiment_2022_2024.csv)
  - Shai Gilgeous-Alexander (File: shai_gilgeous-alexander_stats_sentiment_2022_2024.csv)
  - Stephen Curry (File: stephen_curry_stats_sentiment_2022_2024.csv)
In [2]:
# High-level Goal: Unsupervised exploration of NBA player performance and associated Reddit sentiment.
# This notebook will iterate through available processed player data, performing
# data loading, preprocessing, visualization, and correlation analysis.
print("Project Goal: Automated analysis for multiple players.")
Project Goal: Automated analysis for multiple players.
In [3]:
def load_and_preprocess_data(file_path, player_name_display):
    """Loads and preprocesses the combined stats and sentiment data for a player."""
    if not file_path.exists():
        print(f" ERROR: Data file not found at {file_path} for {player_name_display}")
        return None

    print(f"\n--- Loading data for {player_name_display} from {file_path.name} ---")
    df = pd.read_csv(file_path)
    print(f"Initial rows loaded: {len(df)}")

    if 'game_date' not in df.columns:
        # Fallback if 'game_date' was not correctly named/saved by previous script, try 'GAME_DATE'
        if 'GAME_DATE' in df.columns:
            print("Using 'GAME_DATE' and converting to datetime.")
            df['game_date'] = pd.to_datetime(df['GAME_DATE'], errors='coerce')
        else:
            print(f" ERROR: Neither 'game_date' nor 'GAME_DATE' column found for {player_name_display}.")
            return None
    else:
        df['game_date'] = pd.to_datetime(df['game_date'], errors='coerce')
    
    df.dropna(subset=['game_date'], inplace=True)

    if 'WL' in df.columns:
        df['WIN'] = df['WL'].apply(lambda x: 1 if x == 'W' else 0 if x == 'L' else np.nan)
        df.dropna(subset=['WIN'], inplace=True)
        df['WIN'] = df['WIN'].astype(int)
    else:
        print(f"⚠️ WL column not found for {player_name_display}. Cannot create WIN feature.")
        df['WIN'] = np.nan # Add WIN column as NaN if WL is missing

    print(f"Rows before filtering on post_count: {len(df)}")
    if 'post_count' in df.columns:
        df_filtered = df[df['post_count'] > 0].copy()
        print(f"Rows after filtering for post_count > 0: {len(df_filtered)}")
        if len(df_filtered) == 0:
            print(f"⚠️ No games found with post_count > 0 for {player_name_display}. Sentiment analysis will be limited.")
            # Return the original df if no posts, so at least stats can be analyzed if desired
            # For this project, we require sentiment, so we'll return None if no sentiment games
            return None 
        print(f"Filtered data to {len(df_filtered)} games with sentiment (post_count > 0).")
    else:
        print(f" ERROR: 'post_count' column not found for {player_name_display}. Cannot filter for games with sentiment.")
        return None

    if {'mean_sentiment', 'post_count'}.issubset(df_filtered.columns):
        df_filtered['sent_intensity'] = (
            df_filtered['mean_sentiment'] * df_filtered['post_count']
        )
    else:
        df_filtered['sent_intensity'] = np.nan
        
    return df_filtered

print("Helper function 'load_and_preprocess_data' defined.")
Helper function 'load_and_preprocess_data' defined.
In [4]:
all_players_data_list = []
if not player_files_info:
    print("No player files were found to process. Halting execution.")
else:
    for player_info in player_files_info:
        PLAYER_SLUG = player_info["slug"]
        PLAYER_NAME_DISPLAY = player_info["display_name"]
        input_file_path = player_info["file_path"]

        print(f"\n\n{'='*20} PROCESSING: {PLAYER_NAME_DISPLAY} {'='*20}")
        
        df_player = load_and_preprocess_data(input_file_path, PLAYER_NAME_DISPLAY)

        if df_player is None or df_player.empty:
            print(f"--- Skipping further analysis for {PLAYER_NAME_DISPLAY} due to missing data or no games with sentiment ---")
            continue

        print(f"\n--- Univariate Distributions for {PLAYER_NAME_DISPLAY} ---")
        performance_features_to_plot = ['PTS', 'AST', 'REB', 'PLUS_MINUS', 'FG_PCT']
        sentiment_features_to_plot = ['mean_sentiment', 'pos_share', 
                                      'neg_share' if 'neg_share' in df_player.columns else 'min_sentiment', 
                                      'post_count', 'avg_delta_days']
        
        plt.figure(figsize=(18, 10))
        plot_idx = 1
        for col_list, sup_title_part in [(performance_features_to_plot, "Performance"), (sentiment_features_to_plot, "Sentiment Metrics")]:
            for col in col_list:
                if col in df_player.columns and not df_player[col].isnull().all():
                    plt.subplot(2, max(len(performance_features_to_plot), len(sentiment_features_to_plot)), plot_idx)
                    sns.histplot(df_player[col].dropna(), kde=True, bins=15)
                    plt.title(f'Distribution of {col}')
                    plt.xlabel(col)
                    plt.ylabel('Frequency')
                    plot_idx +=1
                else:
                    print(f"  Skipping histogram for missing/empty column: {col}")
        plt.tight_layout(rect=[0, 0, 1, 0.96])
        plt.suptitle(f'Univariate Distributions for {PLAYER_NAME_DISPLAY}', y=1.00, fontsize=16)
        plt.show()

        print(f"\n--- Correlation Analysis for {PLAYER_NAME_DISPLAY} ---")
        numerical_stats_cols = ['PTS', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'FG_PCT', 'FG3_PCT', 'FT_PCT', 'PLUS_MINUS', 'WIN', 'MIN']
        numerical_sentiment_cols = ['mean_sentiment', 'min_sentiment', 'max_sentiment', 'pos_share', 
                                    'neg_share' if 'neg_share' in df_player.columns else None, 
                                    'post_count', 'avg_delta_days', 'min_delta_days', 'max_delta_days']
        numerical_sentiment_cols = [col for col in numerical_sentiment_cols if col is not None]

        correlation_features = [col for col in numerical_stats_cols if col in df_player.columns and pd.api.types.is_numeric_dtype(df_player[col])] + \
                               [col for col in numerical_sentiment_cols if col in df_player.columns and pd.api.types.is_numeric_dtype(df_player[col])]
        
        df_corr = df_player[list(set(correlation_features))].copy()
        
        for col in df_corr.columns:
            df_corr[col] = pd.to_numeric(df_corr[col], errors='coerce')
        df_corr.dropna(inplace=True)

        if len(df_corr) > 1 and len(df_corr.columns) > 1:
            correlation_matrix = df_corr.corr()
            plt.figure(figsize=(18, 15))
            sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5, annot_kws={"size": 7})
            plt.title(f'Correlation Matrix for {PLAYER_NAME_DISPLAY}', fontsize=16)
            plt.xticks(rotation=45, ha='right', fontsize=8)
            plt.yticks(rotation=0, fontsize=8)
            plt.tight_layout()
            plt.show()
        else:
            print("  ⚠️ Not enough data or columns for correlation matrix.")

        print(f"\n--- Unsupervised Clustering for {PLAYER_NAME_DISPLAY} ---")

        cluster_features = [
            'PTS', 'AST', 'REB', 'PLUS_MINUS',
            'mean_sentiment', 'sent_intensity'
        ]
        cluster_features = [c for c in cluster_features if c in df_player.columns]

        X = df_player[cluster_features].fillna(0)
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        pca = PCA(n_components=2, random_state=42)
        X_pca = pca.fit_transform(X_scaled)
        print("   • PCA two‑component variance:",
              round(pca.explained_variance_ratio_.sum(), 3))

        inertias = []
        for k in range(1, 8):
            inertias.append(
                KMeans(n_clusters=k, n_init=10, random_state=42)
                .fit(X_scaled).inertia_
            )
        plt.figure(figsize=(4, 3))
        plt.plot(range(1, 8), inertias, marker='o')
        plt.title('Elbow for K‑means'); plt.xlabel('k'); plt.ylabel('Inertia')
        plt.show()

        k = 3
        kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
        df_player['cluster'] = kmeans.fit_predict(X_scaled)

        plt.figure(figsize=(7, 6))
        sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1],
                        hue=df_player['cluster'], palette='Set2', s=80)
        plt.title(f'PCA space – {PLAYER_NAME_DISPLAY}')
        plt.xlabel('PC1'); plt.ylabel('PC2')
        plt.legend(title='Cluster')
        plt.show()

        profile_cols = cluster_features + (['WIN'] if 'WIN' in df_player.columns else [])
        print("Cluster medians:")
        display(
            df_player.groupby('cluster')[profile_cols]
                     .median()
                     .round(2)
        )

        print(f"\n--- Bivariate Scatter Plots for {PLAYER_NAME_DISPLAY} ---")
        key_relationships = [
            ('PTS', 'mean_sentiment'), ('PLUS_MINUS', 'mean_sentiment'),
            ('PTS', 'post_count'), ('WIN', 'mean_sentiment'),
            ('FG_PCT', 'mean_sentiment'), ('mean_sentiment', 'avg_delta_days')
        ]
        plt.figure(figsize=(20, 10))
        plot_idx = 1
        for x_col, y_col in key_relationships:
            if x_col in df_player.columns and y_col in df_player.columns:
                plt.subplot(2, 3, plot_idx)
                if x_col == 'WIN' and not df_player[x_col].isnull().all():
                    sns.boxplot(x=df_player[x_col].astype(int), y=df_player[y_col].dropna(), palette={0: 'red', 1: 'green'})
                    plt.xticks([0,1], ['Loss', 'Win'])
                elif not df_player[x_col].isnull().all() and not df_player[y_col].isnull().all():
                    sns.scatterplot(data=df_player, x=x_col, y=y_col, hue='WIN' if 'WIN' in df_player.columns else None, 
                                    palette={0: 'red', 1: 'green'} if 'WIN' in df_player.columns else None, alpha=0.7)
                    try:
                        sns.regplot(data=df_player, x=x_col, y=y_col, scatter=False, color='blue', line_kws={'linestyle':'--'})
                    except ValueError:
                        print(f"Could not plot regression line for {x_col} vs {y_col} (possibly due to low variance or NaNs).")
                plt.title(f'{x_col} vs. {y_col}')
                plot_idx += 1
            else:
                 print(f"  Skipping bivariate plot: missing {x_col} or {y_col}")
        plt.tight_layout(rect=[0, 0, 1, 0.96])
        plt.suptitle(f'Key Bivariate Relationships for {PLAYER_NAME_DISPLAY}', y=1.00, fontsize=18)
        plt.show()

        if 'WIN' in df_player.columns and not df_player['WIN'].isnull().all():
            print(f"\n--- Sentiment Analysis by Game Outcome for {PLAYER_NAME_DISPLAY} ---")
            plt.figure(figsize=(18, 6))
            sentiment_metrics_for_outcome = ['mean_sentiment', 'pos_share', 
                                             'neg_share' if 'neg_share' in df_player.columns else 'min_sentiment', 
                                             'post_count']
            plot_idx = 1
            for metric in sentiment_metrics_for_outcome:
                if metric in df_player.columns and not df_player[metric].isnull().all():
                    plt.subplot(1, len(sentiment_metrics_for_outcome), plot_idx)
                    sns.boxplot(x='WIN', y=metric, data=df_player, palette={0: 'salmon', 1: 'lightgreen'})
                    plt.title(f'{metric} by Outcome')
                    plt.xticks([0, 1], ['Loss', 'Win'])
                    plt.xlabel('Game Outcome')
                    plt.ylabel(metric.replace('_', ' ').title())
                    plot_idx +=1
                else:
                    print(f"  Skipping outcome plot for missing/empty column: {metric}")
            plt.tight_layout(rect=[0, 0, 1, 0.96])
            plt.suptitle(f'Sentiment Metrics by Game Outcome for {PLAYER_NAME_DISPLAY}', y=1.00, fontsize=16)
            plt.show()
        else:
            print(f"  Skipping sentiment by game outcome for {PLAYER_NAME_DISPLAY} (WIN column missing or all NaN).")

        if df_player is not None and not df_player.empty:
            df_player_copy = df_player.copy() # Make a copy
            df_player_copy['player_slug'] = PLAYER_SLUG # Add a column to identify the player
            all_players_data_list.append(df_player_copy)
        print(f"\n--- Finished processing {PLAYER_NAME_DISPLAY} ---")

if all_players_data_list:
    df_all_players_combined = pd.concat(all_players_data_list, ignore_index=True)
    print(f"\n\n✅ Combined data for {len(all_players_data_list)} players into df_all_players_combined ({len(df_all_players_combined)} total game rows with sentiment).")

else:
    print("⚠️ No data was accumulated for combined analysis. 'df_all_players_combined' will not be created.")
    df_all_players_combined = pd.DataFrame() # Create an empty df to avoid errors in next cell

==================== PROCESSING: Anthony Edwards ====================

--- Loading data for Anthony Edwards from anthony_edwards_stats_sentiment_2022_2024.csv ---
Initial rows loaded: 237
Rows before filtering on post_count: 237
Rows after filtering for post_count > 0: 27
Filtered data to 27 games with sentiment (post_count > 0).

--- Univariate Distributions for Anthony Edwards ---
No description has been provided for this image
--- Correlation Analysis for Anthony Edwards ---
No description has been provided for this image
--- Unsupervised Clustering for Anthony Edwards ---
   • PCA two‑component variance: 0.568
C:\Users\realo\AppData\Roaming\Python\Python37\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 8209 (\N{NON-BREAKING HYPHEN}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
No description has been provided for this image
No description has been provided for this image
Cluster medians:
PTS AST REB PLUS_MINUS mean_sentiment sent_intensity WIN
cluster
0 27.5 5.0 5.0 0.5 0.67 0.75 1.0
1 15.0 7.0 13.0 -3.5 0.54 1.08 1.0
2 36.0 2.0 6.0 -2.0 0.03 0.04 0.0
--- Bivariate Scatter Plots for Anthony Edwards ---
No description has been provided for this image
--- Sentiment Analysis by Game Outcome for Anthony Edwards ---
No description has been provided for this image
--- Finished processing Anthony Edwards ---


==================== PROCESSING: Donovan Mitchell ====================

--- Loading data for Donovan Mitchell from donovan_mitchell_stats_sentiment_2022_2024.csv ---
Initial rows loaded: 194
Rows before filtering on post_count: 194
Rows after filtering for post_count > 0: 24
Filtered data to 24 games with sentiment (post_count > 0).

--- Univariate Distributions for Donovan Mitchell ---
No description has been provided for this image
--- Correlation Analysis for Donovan Mitchell ---
No description has been provided for this image
--- Unsupervised Clustering for Donovan Mitchell ---
   • PCA two‑component variance: 0.56
C:\Users\realo\AppData\Roaming\Python\Python37\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 8209 (\N{NON-BREAKING HYPHEN}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
No description has been provided for this image
No description has been provided for this image
Cluster medians:
PTS AST REB PLUS_MINUS mean_sentiment sent_intensity WIN
cluster
0 12.5 4.5 3.0 5.5 0.73 0.73 0.5
1 30.0 6.0 5.0 15.5 0.66 0.85 1.0
2 27.0 7.0 6.5 5.0 -0.02 -0.02 0.5
--- Bivariate Scatter Plots for Donovan Mitchell ---
No description has been provided for this image
--- Sentiment Analysis by Game Outcome for Donovan Mitchell ---
No description has been provided for this image
--- Finished processing Donovan Mitchell ---


==================== PROCESSING: Giannis Antetokounmpo ====================

--- Loading data for Giannis Antetokounmpo from giannis_antetokounmpo_stats_sentiment_2022_2024.csv ---
Initial rows loaded: 203
Rows before filtering on post_count: 203
Rows after filtering for post_count > 0: 24
Filtered data to 24 games with sentiment (post_count > 0).

--- Univariate Distributions for Giannis Antetokounmpo ---
No description has been provided for this image
--- Correlation Analysis for Giannis Antetokounmpo ---
No description has been provided for this image
--- Unsupervised Clustering for Giannis Antetokounmpo ---
   • PCA two‑component variance: 0.623
C:\Users\realo\AppData\Roaming\Python\Python37\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 8209 (\N{NON-BREAKING HYPHEN}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
No description has been provided for this image
No description has been provided for this image
Cluster medians:
PTS AST REB PLUS_MINUS mean_sentiment sent_intensity WIN
cluster
0 33.5 4.5 12.0 -6.5 0.66 0.66 0.0
1 26.0 10.0 12.0 22.0 0.66 0.66 1.0
2 27.0 6.0 10.0 1.0 -0.73 -0.73 0.0
--- Bivariate Scatter Plots for Giannis Antetokounmpo ---
No description has been provided for this image
--- Sentiment Analysis by Game Outcome for Giannis Antetokounmpo ---
No description has been provided for this image
--- Finished processing Giannis Antetokounmpo ---


==================== PROCESSING: Jalen Brunson ====================

--- Loading data for Jalen Brunson from jalen_brunson_stats_sentiment_2022_2024.csv ---
Initial rows loaded: 210
Rows before filtering on post_count: 210
Rows after filtering for post_count > 0: 24
Filtered data to 24 games with sentiment (post_count > 0).

--- Univariate Distributions for Jalen Brunson ---
No description has been provided for this image
--- Correlation Analysis for Jalen Brunson ---
No description has been provided for this image
--- Unsupervised Clustering for Jalen Brunson ---
   • PCA two‑component variance: 0.583
C:\Users\realo\AppData\Roaming\Python\Python37\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 8209 (\N{NON-BREAKING HYPHEN}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
No description has been provided for this image
No description has been provided for this image
Cluster medians:
PTS AST REB PLUS_MINUS mean_sentiment sent_intensity WIN
cluster
0 24.0 7.0 3.0 10.0 0.60 0.95 1.0
1 24.0 5.0 1.5 -15.0 0.66 0.66 0.0
2 37.0 9.0 3.0 -1.0 0.67 0.67 1.0
--- Bivariate Scatter Plots for Jalen Brunson ---
No description has been provided for this image
--- Sentiment Analysis by Game Outcome for Jalen Brunson ---
No description has been provided for this image
--- Finished processing Jalen Brunson ---


==================== PROCESSING: Lebron James ====================

--- Loading data for Lebron James from lebron_james_stats_sentiment_2022_2024.csv ---
Initial rows loaded: 196
Rows before filtering on post_count: 196
Rows after filtering for post_count > 0: 21
Filtered data to 21 games with sentiment (post_count > 0).

--- Univariate Distributions for Lebron James ---
No description has been provided for this image
--- Correlation Analysis for Lebron James ---
No description has been provided for this image
--- Unsupervised Clustering for Lebron James ---
   • PCA two‑component variance: 0.535
C:\Users\realo\AppData\Roaming\Python\Python37\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 8209 (\N{NON-BREAKING HYPHEN}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
No description has been provided for this image
No description has been provided for this image
Cluster medians:
PTS AST REB PLUS_MINUS mean_sentiment sent_intensity WIN
cluster
0 33.0 8.0 8.0 8.0 0.69 1.48 1.0
1 22.0 9.0 7.0 -1.0 0.20 0.20 1.0
2 25.0 7.0 6.0 -1.0 0.78 0.80 1.0
--- Bivariate Scatter Plots for Lebron James ---
No description has been provided for this image
--- Sentiment Analysis by Game Outcome for Lebron James ---
No description has been provided for this image
--- Finished processing Lebron James ---


==================== PROCESSING: Luka Doncic ====================

--- Loading data for Luka Doncic from luka_doncic_stats_sentiment_2022_2024.csv ---
Initial rows loaded: 186
Rows before filtering on post_count: 186
Rows after filtering for post_count > 0: 15
Filtered data to 15 games with sentiment (post_count > 0).

--- Univariate Distributions for Luka Doncic ---
No description has been provided for this image
--- Correlation Analysis for Luka Doncic ---
No description has been provided for this image
--- Unsupervised Clustering for Luka Doncic ---
   • PCA two‑component variance: 0.635
C:\Users\realo\AppData\Roaming\Python\Python37\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 8209 (\N{NON-BREAKING HYPHEN}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
No description has been provided for this image
No description has been provided for this image
Cluster medians:
PTS AST REB PLUS_MINUS mean_sentiment sent_intensity WIN
cluster
0 31.5 7.0 8.0 7.0 0.66 0.88 1.0
1 21.0 12.0 11.5 10.5 0.59 1.03 1.0
2 14.0 4.0 5.0 15.0 0.06 0.12 1.0
--- Bivariate Scatter Plots for Luka Doncic ---
No description has been provided for this image
--- Sentiment Analysis by Game Outcome for Luka Doncic ---
No description has been provided for this image
--- Finished processing Luka Doncic ---


==================== PROCESSING: Shai Gilgeous-Alexander ====================

--- Loading data for Shai Gilgeous-Alexander from shai_gilgeous-alexander_stats_sentiment_2022_2024.csv ---
Initial rows loaded: 219
Rows before filtering on post_count: 219
Rows after filtering for post_count > 0: 31
Filtered data to 31 games with sentiment (post_count > 0).

--- Univariate Distributions for Shai Gilgeous-Alexander ---
No description has been provided for this image
--- Correlation Analysis for Shai Gilgeous-Alexander ---
No description has been provided for this image
--- Unsupervised Clustering for Shai Gilgeous-Alexander ---
   • PCA two‑component variance: 0.545
C:\Users\realo\AppData\Roaming\Python\Python37\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 8209 (\N{NON-BREAKING HYPHEN}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
No description has been provided for this image
No description has been provided for this image
Cluster medians:
PTS AST REB PLUS_MINUS mean_sentiment sent_intensity WIN
cluster
0 25.0 8.0 3.0 8.0 0.29 0.29 1.0
1 33.0 6.0 5.0 16.0 0.66 0.67 1.0
2 22.0 4.0 5.0 -6.0 0.74 0.82 0.0
--- Bivariate Scatter Plots for Shai Gilgeous-Alexander ---
No description has been provided for this image
--- Sentiment Analysis by Game Outcome for Shai Gilgeous-Alexander ---
No description has been provided for this image
--- Finished processing Shai Gilgeous-Alexander ---


==================== PROCESSING: Stephen Curry ====================

--- Loading data for Stephen Curry from stephen_curry_stats_sentiment_2022_2024.csv ---
Initial rows loaded: 200
Rows before filtering on post_count: 200
Rows after filtering for post_count > 0: 27
Filtered data to 27 games with sentiment (post_count > 0).

--- Univariate Distributions for Stephen Curry ---
No description has been provided for this image
--- Correlation Analysis for Stephen Curry ---
No description has been provided for this image
--- Unsupervised Clustering for Stephen Curry ---
   • PCA two‑component variance: 0.545
C:\Users\realo\AppData\Roaming\Python\Python37\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 8209 (\N{NON-BREAKING HYPHEN}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
No description has been provided for this image
No description has been provided for this image
Cluster medians:
PTS AST REB PLUS_MINUS mean_sentiment sent_intensity WIN
cluster
0 31.0 8.0 5.0 15.0 0.41 0.52 1.0
1 17.0 5.0 1.0 -2.0 0.71 1.96 1.0
2 25.0 5.0 5.0 6.0 0.73 0.78 1.0
--- Bivariate Scatter Plots for Stephen Curry ---
No description has been provided for this image
--- Sentiment Analysis by Game Outcome for Stephen Curry ---
No description has been provided for this image
--- Finished processing Stephen Curry ---


✅ Combined data for 8 players into df_all_players_combined (193 total game rows with sentiment).
In [5]:
# ===================================================================================
# CELL 5: GRAND AGGREGATE ANALYSIS - VISUALIZING OVERALL TRENDS ACROSS ALL PLAYERS
# ===================================================================================

if 'df_all_players_combined' in locals() and not df_all_players_combined.empty:
    print("\n\n📊 GRAND AGGREGATE VISUALIZATIONS & ANALYSIS (Across All Processed Players) 📊")

    # --- 5.0 Helper sets (ensure columns exist in the combined dataframe) ---
    all_cols = df_all_players_combined.columns
    
    perf_cols  = [col for col in ['PTS', 'AST', 'REB', 'PLUS_MINUS', 'FG_PCT', 'WIN', 'MIN'] if col in all_cols]
    senti_cols = [col for col in ['mean_sentiment', 'pos_share', 
                                  'neg_share' if 'neg_share' in all_cols else 'min_sentiment', 
                                  'post_count', 'avg_delta_days', 'sent_intensity'] if col in all_cols]
    
    key_pairs_template  = [
        ('PTS', 'mean_sentiment'), ('PLUS_MINUS', 'mean_sentiment'),
        ('PTS', 'post_count'), ('WIN', 'mean_sentiment'),
        ('FG_PCT', 'mean_sentiment'), ('mean_sentiment', 'avg_delta_days'),
        ('PTS', 'sent_intensity'), ('PLUS_MINUS', 'sent_intensity')
    ]
    key_pairs = [(x,y) for x,y in key_pairs_template if x in all_cols and y in all_cols]

    # --- 5.1 Distributions of Player-Level Means ---
    # Calculate mean per player for key metrics
    if 'player_slug' in df_all_players_combined.columns:
        player_means_perf = df_all_players_combined.groupby('player_slug')[perf_cols].mean()
        player_means_senti = df_all_players_combined.groupby('player_slug')[senti_cols].mean()

        fig, axes = plt.subplots(1, 2, figsize=(18, 7))
        
        # Performance player means distribution
        if not player_means_perf.empty:
            player_means_perf.plot(kind='box', ax=axes[0], vert=False)
            axes[0].set_title('Distribution of Player Average Performance Metrics')
            axes[0].set_xlabel('Average Value')
        else:
            axes[0].set_title('No Player Average Performance Data')

        # Sentiment player means distribution
        if not player_means_senti.empty:
            player_means_senti.plot(kind='box', ax=axes[1], vert=False)
            axes[1].set_title('Distribution of Player Average Sentiment Metrics')
            axes[1].set_xlabel('Average Value')
        else:
            axes[1].set_title('No Player Average Sentiment Data')
            
        plt.suptitle("Distributions of Player-Level Averages (Each player is a data point)", fontsize=16, y=1.02)
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.show()
    else:
        print("⚠️ 'player_slug' column not found in df_all_players_combined. Cannot create player-level mean distributions.")

    # --- 5.2 Scatter Plots of All Game Data (Colored by Player if < N players, or with overall trend) ---
    print("\n--- Key Relationship Scatter Plots (All Games, All Players) ---")
    num_players = df_all_players_combined['player_slug'].nunique() if 'player_slug' in df_all_players_combined.columns else 1
    use_player_hue = num_players <= 10 and 'player_slug' in df_all_players_combined.columns # Only use hue if few players

    plt.figure(figsize=(20, 15 if len(key_pairs) > 3 else 10))
    for idx, (x_col, y_col) in enumerate(key_pairs):
        plt.subplot( (len(key_pairs) + 2) // 3, 3, idx + 1) # Dynamic subplot grid
        
        if x_col == 'WIN':
            sns.boxplot(data=df_all_players_combined, x=x_col, y=y_col, palette={0: 'salmon', 1: 'lightgreen'})
            plt.xticks([0,1], ['Loss', 'Win'])
            plt.title(f'Overall: {y_col} by Game Outcome')
        else:
            hue_arg = 'player_slug' if use_player_hue else None
            sns.scatterplot(data=df_all_players_combined, x=x_col, y=y_col, hue=hue_arg, 
                            alpha=0.5, s=30, legend='auto' if use_player_hue else False)
            
            # Overall regression line
            sns.regplot(data=df_all_players_combined, x=x_col, y=y_col, scatter=False, color='black', 
                        line_kws={'linestyle':'-', 'linewidth': 1.5}, label="Overall Trend")
            
            plt.title(f'Overall: {x_col} vs. {y_col}')
            plt.xlabel(x_col)
            plt.ylabel(y_col)
            if use_player_hue:
                 plt.legend(title='Player', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize='small')

    plt.tight_layout(rect=[0, 0, 1, 0.96])
    plt.suptitle("Key Relationships Across All Player Data", fontsize=18, y=1.00)
    plt.show()

    # --- 5.3 Overall Sentiment by Game Outcome (Aggregated Bar Chart) ---
    if 'WIN' in df_all_players_combined.columns and 'mean_sentiment' in df_all_players_combined.columns:
        print("\n--- Overall Mean Sentiment by Game Outcome (All Players) ---")
        plt.figure(figsize=(8, 6))
        outcome_summary = df_all_players_combined.groupby('WIN')['mean_sentiment'].agg(['mean', 'count'])
        
        if 0 in outcome_summary.index and 1 in outcome_summary.index:
            outcome_summary['mean'].rename({0: f"Loss (N={outcome_summary.loc[0,'count']})", 
                                            1: f"Win (N={outcome_summary.loc[1,'count']})"}).plot(kind='bar', color=['salmon', 'lightgreen'])
            plt.ylabel('Average Mean Sentiment')
            plt.title('Overall Reddit Sentiment by Game Outcome')
            plt.xticks(rotation=0)
            plt.show()
        else:
            print("  ⚠️ Not enough data for both Wins and Losses to plot sentiment by outcome.")
            
    # --- 5.4 Table of Grand Means for Key Columns ---
    print("\n--- GRAND MEAN VALUES FOR KEY COLUMNS (All Players, All Games with Sentiment) ---")
    key_numeric_cols = [col for col in perf_cols + senti_cols if col in df_all_players_combined.columns and pd.api.types.is_numeric_dtype(df_all_players_combined[col])]
    grand_means = df_all_players_combined[key_numeric_cols].mean().round(3)
    
    # For better display in Jupyter
    from IPython.display import display
    print("Grand Means Table:")
    display(pd.DataFrame(grand_means, columns=['Grand Mean']))

else:
    print("⚠️ 'df_all_players_combined' is not defined or is empty. Cannot generate grand aggregate visuals.")

# The "All Players Processed!" print should be outside this cell, at the very end of your notebook's main execution flow.
print("\n\n🏁 All Players Processed! 🏁")

📊 GRAND AGGREGATE VISUALIZATIONS & ANALYSIS (Across All Processed Players) 📊
No description has been provided for this image
--- Key Relationship Scatter Plots (All Games, All Players) ---
No description has been provided for this image
--- Overall Mean Sentiment by Game Outcome (All Players) ---
No description has been provided for this image
--- GRAND MEAN VALUES FOR KEY COLUMNS (All Players, All Games with Sentiment) ---
Grand Means Table:
Grand Mean
PTS 27.891
AST 6.264
REB 6.218
PLUS_MINUS 5.378
FG_PCT 0.495
WIN 0.663
MIN 34.067
mean_sentiment 0.611
pos_share 0.943
neg_share 0.057
post_count 1.435
avg_delta_days 0.484
sent_intensity 0.870

🏁 All Players Processed! 🏁
In [ ]:
 
In [ ]: